
from IPython.core.display import display, HTML, Javascript

# Custom CSS for the notebook's markdown cells (section banners, intro/final
# boxes and <mark> highlights), injected through IPython's HTML display.
# NOTE(review): "display:fill" below is not a valid CSS value and is ignored
# by browsers; left as-is so the rendered look does not change.
html_contents = """
<!DOCTYPE html>
<html lang="en">
<head>
<style>
.section2{
Font-Family: 'Playfair Display', Serif;
font-style: italic;
font-weight: bold;
background: #D9D874;
color:black;
text-align: center;
font-size:30px;
}
.section{
Font-Family: 'Source Sans Pro', Sans-Serif;
font-weight: bold;
font-size:20px;
color:#a02933;
}
.intro{
Font-Family: 'Playfair Display', Serif;
text-align:center;
font-size:15px;
color:#4e4151;
letter-spacing:0.5px;
display:fill;
background-color:#dbba78;
padding: 0.8em;
border-style: solid;
border-color:#4e4151;
border-radius:10px;
}
.final{
Font-Family: 'Playfair Display', Serif;
text-align:center;
font-size:15px;
color:#4e4151;
letter-spacing:0.5px;
display:fill;
background-color:#dbba78;
padding: 0.8em;
border-style: dashed;
border-color:#4e4151;
border-radius:10px;
}
mark {
Font-Family: 'Playfair Display', Serif;
font-style: italic;
background: #a02933;
font-weight: bold;
color: #dbba78;
text-align: center;
font-size:25px;
justify-content: center;
}
.intro2 {
padding: 1rem;
width: 600px;
box-shadow: 0 5px 15px 0 rgba(0,0,0,0.08);
border-color:#4e4151;
border-radius:5px;
border-style: dashed;
background-color:#dbba78;
font-size:15px;
font-family: 'Playfair Display', Serif;
font-weight: none;
letter-spacing:0.5px;
text-align: left;
border-width: 2px;
color: #4e4151;
}
hr { border: 0.2px solid #a02933;
}
</style>
</head>
</html>
"""
HTML(html_contents)
from IPython.core.display import display, HTML, Javascript
# ----- Notebook Theme -----
# Palette used throughout the notebook; individual roles are picked below.
color_map = ["#bfd3e6", "#9b5b4f", "#4e4151", "#78ccdb", "#E9C9C9", "#909195","#dc1e1e","#a02933","#716807","#E9C9C9"]
prompt = color_map[-1]
main_color = color_map[3]
strong_main_color = color_map[1]
custom_colors = [strong_main_color, main_color]
# CSS template for the Jupyter UI; the %s placeholders are filled later with
# a width declaration followed by the theme colors chosen above.
css_file = '''
div #notebook {
background-color: white;
line-height: 20px;
}
#notebook-container {
%s
margin-top: 2em;
padding-top: 2em;
border-top: 4px solid %s; /* light orange */
-webkit-box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
}
div .input {
margin-bottom: 1em;
}
.rendered_html h1, .rendered_html h2, .rendered_html h3, .rendered_html h4, .rendered_html h5, .rendered_html h6 {
color: %s; /* light orange */
font-weight: 600;
}
div.input_area {
border: none;
background-color: %s; /* rgba(229, 143, 101, 0.1); light orange [exactly #E58F65] */
border-top: 2px solid %s; /* light orange */
}
div.input_prompt {
color: %s; /* light blue */
}
div.output_prompt {
color: %s; /* strong orange */
}
div.cell.selected:before, div.cell.selected.jupyter-soft-selected:before {
background: %s; /* light orange */
}
div.cell.selected, div.cell.selected.jupyter-soft-selected {
border-color: %s; /* light orange */
}
.edit_mode div.cell.selected:before {
background: %s; /* light orange */
}
.edit_mode div.cell.selected {
border-color: %s; /* light orange */
}
'''
def to_rgb(h):
    """Convert a 6-digit hex color string (without '#') to an (r, g, b) tuple."""
    red = int(h[0:2], 16)
    green = int(h[2:4], 16)
    blue = int(h[4:6], 16)
    return (red, green, blue)
# Translucent version of the main color for cell backgrounds.
main_color_rgba = 'rgba(%s, %s, %s, 0.1)' % (to_rgb(main_color[1:]))
# Fill the CSS template and write it out. A context manager closes the file
# handle deterministically (the original open(...).write(...) leaked it).
with open('notebook.css', 'w') as css_out:
    css_out.write(css_file % ('width: 95%;', main_color, main_color,
                              main_color_rgba, main_color, main_color,
                              prompt, main_color, main_color, main_color,
                              main_color))
def nb():
    """Return an HTML <style> tag that injects notebook.css into the notebook."""
    # Context manager closes the handle (the original open(...).read() leaked it).
    with open("notebook.css", "r") as css_in:
        return HTML("<style>" + css_in.read() + "</style>")
nb()
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# linear algebra
import numpy as np
import pandas as pd
#viz
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mlb
import matplotlib.image as mpimg
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
from IPython.display import display
import os
plt.style.use("seaborn")
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
#theme
theme = ["#DEF5E5", "#459E97", "#4B9ABB","#0A4361","#E68193",]
df=pd.read_csv("../input/medias-cost-prediction-in-foodmart/media prediction and its cost.csv")
df.head()
| food_category | food_department | food_family | store_sales(in millions) | store_cost(in millions) | unit_sales(in millions) | promotion_name | sales_country | marital_status | gender | ... | grocery_sqft | frozen_sqft | meat_sqft | coffee_bar | video_store | salad_bar | prepared_food | florist | media_type | cost | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Breakfast Foods | Frozen Foods | Food | 7.360 | 2.723 | 4.000 | Bag Stuffers | USA | M | F | ... | 18670.000 | 5415.000 | 3610.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | Daily Paper, Radio | 126.620 |
| 1 | Breakfast Foods | Frozen Foods | Food | 5.520 | 2.594 | 3.000 | Cash Register Lottery | USA | M | M | ... | 18670.000 | 5415.000 | 3610.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | Daily Paper, Radio | 59.860 |
| 2 | Breakfast Foods | Frozen Foods | Food | 3.680 | 1.362 | 2.000 | High Roller Savings | USA | S | F | ... | 18670.000 | 5415.000 | 3610.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | Daily Paper, Radio | 84.160 |
| 3 | Breakfast Foods | Frozen Foods | Food | 3.680 | 1.178 | 2.000 | Cash Register Lottery | USA | M | F | ... | 18670.000 | 5415.000 | 3610.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | In-Store Coupon | 95.780 |
| 4 | Breakfast Foods | Frozen Foods | Food | 4.080 | 1.428 | 3.000 | Double Down Sale | USA | M | M | ... | 18670.000 | 5415.000 | 3610.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | Radio | 50.790 |
5 rows × 40 columns
# Describe the data, transposed (T) for readability, with some visual styling.
# This gives general information on the dataset: the count, mean, standard
# deviation, percentiles and min/max of each numeric column.
df.describe().T.style.bar(subset=['mean'], color='#E68193')\
.background_gradient(subset=['std'], cmap='mako_r')\
.background_gradient(subset=['50%'], cmap='mako')
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| store_sales(in millions) | 60428.000000 | 6.541031 | 3.463047 | 0.510000 | 3.810000 | 5.940000 | 8.670000 | 22.920000 |
| store_cost(in millions) | 60428.000000 | 2.619460 | 1.453009 | 0.163200 | 1.500000 | 2.385600 | 3.484025 | 9.726500 |
| unit_sales(in millions) | 60428.000000 | 3.093169 | 0.827677 | 1.000000 | 3.000000 | 3.000000 | 4.000000 | 6.000000 |
| total_children | 60428.000000 | 2.533875 | 1.490165 | 0.000000 | 1.000000 | 3.000000 | 4.000000 | 5.000000 |
| avg_cars_at home(approx) | 60428.000000 | 2.200271 | 1.109644 | 0.000000 | 1.000000 | 2.000000 | 3.000000 | 4.000000 |
| num_children_at_home | 60428.000000 | 0.829351 | 1.303424 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 5.000000 |
| avg_cars_at home(approx).1 | 60428.000000 | 2.200271 | 1.109644 | 0.000000 | 1.000000 | 2.000000 | 3.000000 | 4.000000 |
| SRP | 60428.000000 | 2.115258 | 0.932829 | 0.500000 | 1.410000 | 2.130000 | 2.790000 | 3.980000 |
| gross_weight | 60428.000000 | 13.806433 | 4.622693 | 6.000000 | 9.700000 | 13.600000 | 17.700000 | 21.900000 |
| net_weight | 60428.000000 | 11.796289 | 4.682986 | 3.050000 | 7.710000 | 11.600000 | 16.000000 | 20.800000 |
| recyclable_package | 60428.000000 | 0.558665 | 0.496551 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| low_fat | 60428.000000 | 0.350434 | 0.477110 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| units_per_case | 60428.000000 | 18.860694 | 10.258555 | 1.000000 | 10.000000 | 19.000000 | 28.000000 | 36.000000 |
| store_sqft | 60428.000000 | 27988.477494 | 5701.022090 | 20319.000000 | 23593.000000 | 27694.000000 | 30797.000000 | 39696.000000 |
| grocery_sqft | 60428.000000 | 19133.799696 | 3987.395735 | 13305.000000 | 16232.000000 | 18670.000000 | 22123.000000 | 30351.000000 |
| frozen_sqft | 60428.000000 | 5312.852552 | 1575.907263 | 2452.000000 | 4746.000000 | 5062.000000 | 5751.000000 | 9184.000000 |
| meat_sqft | 60428.000000 | 3541.846280 | 1050.471635 | 1635.000000 | 3164.000000 | 3375.000000 | 3834.000000 | 6122.000000 |
| coffee_bar | 60428.000000 | 0.612646 | 0.487150 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| video_store | 60428.000000 | 0.354157 | 0.478261 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| salad_bar | 60428.000000 | 0.587956 | 0.492207 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| prepared_food | 60428.000000 | 0.587956 | 0.492207 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| florist | 60428.000000 | 0.562603 | 0.496069 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| cost | 60428.000000 | 99.262366 | 30.011257 | 50.790000 | 69.650000 | 98.520000 | 126.620000 | 149.750000 |
# Bar chart of how many columns exist per dtype, with the count written
# above each bar, followed by a missingno completeness bar for every column.
sns.set(style="ticks", context="talk", font_scale=1, palette="husl")
plt.figure(figsize=(8, 6))
ax = df.dtypes.value_counts().plot(kind='bar', grid=False, fontsize=20)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 0.2, bar_height, ha='center', size=25)
sns.despine()
msno.bar(df, figsize=(15, 5), fontsize=10, color='#459E97');
As we can see, the data is already clean, so we don't need to apply any techniques to handle missing values.
# Checking missing values: keep only columns with at least one null
# (an empty Series means the data is complete).
null_counts = df.isnull().sum()
missing = null_counts[null_counts > 0]
missing
Series([], dtype: int64)
# Report the number of distinct values per column, split into numeric and
# categorical groups (iterate columns directly instead of range(len(...))).
columns = df.columns
print("******************* Numeric field *******************\n")
for col in columns:
    if df[col].dtypes != object:
        print("unique number of {} -> {}".format(col, len(df[col].unique())))
print("\n******************* Categorical field *******************\n")
for col in columns:
    if df[col].dtypes == object:
        print("unique number of {} -> {}".format(col, len(df[col].unique())))
******************* Numeric field ******************* unique number of store_sales(in millions) -> 1033 unique number of store_cost(in millions) -> 9919 unique number of unit_sales(in millions) -> 6 unique number of total_children -> 6 unique number of avg_cars_at home(approx) -> 5 unique number of num_children_at_home -> 6 unique number of avg_cars_at home(approx).1 -> 5 unique number of SRP -> 315 unique number of gross_weight -> 376 unique number of net_weight -> 332 unique number of recyclable_package -> 2 unique number of low_fat -> 2 unique number of units_per_case -> 36 unique number of store_sqft -> 20 unique number of grocery_sqft -> 20 unique number of frozen_sqft -> 20 unique number of meat_sqft -> 20 unique number of coffee_bar -> 2 unique number of video_store -> 2 unique number of salad_bar -> 2 unique number of prepared_food -> 2 unique number of florist -> 2 unique number of cost -> 328 ******************* Categorical field ******************* unique number of food_category -> 45 unique number of food_department -> 22 unique number of food_family -> 3 unique number of promotion_name -> 49 unique number of sales_country -> 3 unique number of marital_status -> 2 unique number of gender -> 2 unique number of education -> 5 unique number of member_card -> 4 unique number of occupation -> 5 unique number of houseowner -> 2 unique number of avg. yearly_income -> 8 unique number of brand_name -> 111 unique number of store_type -> 5 unique number of store_city -> 19 unique number of store_state -> 10 unique number of media_type -> 13
# Print the full set of distinct values for every categorical (object) column.
unique_values = {}
for column in df.select_dtypes(object).columns:
    unique_values[column] = list(df[column].unique())
for key, value in unique_values.items():
    print(f'{key} : {value}')
food_category : ['Breakfast Foods', 'Bread', 'Canned Shrimp', 'Baking Goods', 'Vegetables', 'Frozen Desserts', 'Candy', 'Snack Foods', 'Dairy', 'Starchy Foods', 'Cleaning Supplies', 'Decongestants', 'Meat', 'Hot Beverages', 'Jams and Jellies', 'Carbonated Beverages', 'Seafood', 'Specialty', 'Kitchen Products', 'Electrical', 'Beer and Wine', 'Candles', 'Fruit', 'Pure Juice Beverages', 'Canned Soup', 'Paper Products', 'Canned Tuna', 'Eggs', 'Hardware', 'Canned Sardines', 'Canned Clams', 'Pain Relievers', 'Side Dishes', 'Bathroom Products', 'Magazines', 'Frozen Entrees', 'Pizza', 'Cold Remedies', 'Canned Anchovies', 'Drinks', 'Hygiene', 'Plastic Products', 'Canned Oysters', 'Packaged Vegetables', 'Miscellaneous'] food_department : ['Frozen Foods', 'Baked Goods', 'Canned Foods', 'Baking Goods', 'Produce', 'Snacks', 'Snack Foods', 'Dairy', 'Starchy Foods', 'Household', 'Health and Hygiene', 'Meat', 'Beverages', 'Seafood', 'Deli', 'Alcoholic Beverages', 'Canned Products', 'Eggs', 'Periodicals', 'Breakfast Foods', 'Checkout', 'Carousel'] food_family : ['Food', 'Non-Consumable', 'Drink'] promotion_name : ['Bag Stuffers', 'Cash Register Lottery', 'High Roller Savings', 'Double Down Sale', 'Green Light Days', 'Big Time Savings', 'Price Savers', 'Price Slashers', 'Dollar Days', 'Two Day Sale', 'Super Duper Savers', 'Weekend Markdown', 'Dollar Cutters', 'Sales Galore', 'Big Promo', 'Free For All', 'Savings Galore', 'Unbeatable Price Savers', 'Price Smashers', 'Shelf Clearing Days', 'Sales Days', 'Go For It', 'Super Savers', 'Wallet Savers', 'Save-It Sale', 'Price Destroyers', 'Two for One', 'Big Time Discounts', 'Shelf Emptiers', 'Bye Bye Baby', 'One Day Sale', 'Fantastic Discounts', 'Saving Days', 'You Save Days', 'Sale Winners', 'Coupon Spectacular', 'Three for One', 'Price Winners', 'Dimes Off', 'I Cant Believe It Sale', 'Money Savers', 'Green Light Special', 'Price Cutters', 'Tip Top Savings', 'Best Savings', 'Pick Your Savings', 'Double Your Savings', 'Mystery Sale', 
'Super Wallet Savers'] sales_country : ['USA', 'Mexico', 'Canada'] marital_status : ['M', 'S'] gender : ['F', 'M'] education : ['Partial High School', 'Bachelors Degree', 'High School Degree', 'Graduate Degree', 'Partial College'] member_card : ['Normal', 'Silver', 'Bronze', 'Golden'] occupation : ['Skilled Manual', 'Professional', 'Manual', 'Management', 'Clerical'] houseowner : ['Y', 'N'] avg. yearly_income : ['$10K - $30K', '$50K - $70K', '$30K - $50K', '$70K - $90K', '$110K - $130K', '$130K - $150K', '$150K +', '$90K - $110K'] brand_name : ['Carrington', 'Golden', 'Imagine', 'Big Time', 'PigTail', 'Fantastic', 'Great', 'Sphinx', 'Modell', 'Colony', 'Blue Label', 'Pleasant', 'Bravo', 'Better', 'Just Right', 'Plato', 'BBB Best', 'Landslide', 'Super', 'CDR', 'High Top', 'Tri-State', 'Hermanos', 'Tell Tale', 'Ebony', 'Thresher', 'Gulf Coast', 'Musial', 'Atomic', 'Choice', 'Fort West', 'Nationeel', 'Horatio', 'Best Choice', 'Fast', 'Gorilla', 'Carlson', 'Even Better', 'Club', 'Booker', 'Shady Lake', 'Monarch', 'Discover', 'Colossal', 'Medalist', 'Jardon', 'Cormorant', 'Sunset', 'Red Wing', 'High Quality', 'Denny', 'Faux Products', 'Steady', 'Consolidated', 'Bird Call', 'Hilltop', 'Ship Shape', 'Footnote', 'Genteel', 'Quick', 'Gerolli', 'Excellent', 'Fabulous', 'Token', 'Skinner', 'Washington', 'Dual City', 'Kiwi', 'Tip Top', 'Amigo', 'Curlew', 'Moms', 'Cutting Edge', 'Red Spade', 'Lake', 'American', 'Walrus', 'Pearl', 'Good', 'Top Measure', 'Portsmouth', 'Toucan', 'Applause', 'Swell', 'Green Ribbon', 'Big City', 'National', 'Blue Medal', 'Urban', 'Jumbo', 'Giant', 'Dollar', 'Mighty Good', 'Robust', 'Gauss', 'Excel', 'Radius', 'Best', 'Jeffers', 'Johnson', 'Special', 'Akron', 'Framton', 'Black Tie', 'Queen', 'James Bay', 'Toretti', 'Prelude', 'Symphony', 'ADJ', 'King'] store_type : ['Deluxe Supermarket', 'Supermarket', 'Gourmet Supermarket', 'Small Grocery', 'Mid-Size Grocery'] store_city : ['Salem', 'Tacoma', 'Seattle', 'Spokane', 'Los Angeles', 'Beverly Hills', 
'Portland', 'Bellingham', 'Orizaba', 'Merida', 'Hidalgo', 'Mexico City', 'Vancouver', 'Bremerton', 'Camacho', 'Guadalajara', 'Acapulco', 'San Francisco', 'Victoria'] store_state : ['OR', 'WA', 'CA', 'Veracruz', 'Yucatan', 'Zacatecas', 'DF', 'BC', 'Jalisco', 'Guerrero'] media_type : ['Daily Paper, Radio', 'In-Store Coupon', 'Radio', 'Daily Paper', 'Product Attachment', 'Sunday Paper, Radio', 'Sunday Paper, Radio, TV', 'Sunday Paper', 'Street Handout', 'TV', 'Bulk Mail', 'Cash Register Handout', 'Daily Paper, Radio, TV']
# The categorical "avg. yearly_income" values embed '$' and ',' characters,
# so derive a cleaned companion column with those characters stripped.
for col in df.columns:
    if '$' in df[col].to_string():
        # regex=False: treat '$' and ',' as literal characters, not patterns
        # (as a regex, '$' anchors at end-of-string and would remove nothing).
        df[col + '_no_dollar'] = (df[col].str.replace('$', '', regex=False)
                                         .str.replace(',', '', regex=False))
df["media_type"].value_counts()
Daily Paper, Radio 6820 Product Attachment 5371 Daily Paper, Radio, TV 5284 Daily Paper 5119 Street Handout 5069 Radio 4980 Sunday Paper 4859 In-Store Coupon 4495 Sunday Paper, Radio 4050 Cash Register Handout 4002 TV 3576 Bulk Mail 3457 Sunday Paper, Radio, TV 3346 Name: media_type, dtype: int64
# low_fat is stored as floats 0.0/1.0 — effectively a binary flag.
df["low_fat"].unique()
array([0., 1.])
# List all column names (note the derived 'avg. yearly_income_no_dollar').
df.columns
Index(['food_category', 'food_department', 'food_family',
'store_sales(in millions)', 'store_cost(in millions)',
'unit_sales(in millions)', 'promotion_name', 'sales_country',
'marital_status', 'gender', 'total_children', 'education',
'member_card', 'occupation', 'houseowner', 'avg_cars_at home(approx)',
'avg. yearly_income', 'num_children_at_home',
'avg_cars_at home(approx).1', 'brand_name', 'SRP', 'gross_weight',
'net_weight', 'recyclable_package', 'low_fat', 'units_per_case',
'store_type', 'store_city', 'store_state', 'store_sqft', 'grocery_sqft',
'frozen_sqft', 'meat_sqft', 'coffee_bar', 'video_store', 'salad_bar',
'prepared_food', 'florist', 'media_type', 'cost',
'avg. yearly_income_no_dollar'],
dtype='object')
These are the outliers we want to handle.
# Box plots for a handful of numeric columns to eyeball outliers.
numeric_data = df[["store_sales(in millions)", "store_cost(in millions)",
                   "meat_sqft", "frozen_sqft", "unit_sales(in millions)"]]
numeric_data.plot(subplots=True, kind='box', layout=(5, 8),
                  figsize=(30, 30), patch_artist=True, color="#459E97")
plt.subplots_adjust(wspace=0.5)
plt.style.use("ggplot")
plt.show()
def out_lairs(col):
    """
    Return (up to) the first 10 rows of `df` whose value in `col` lies
    outside the Tukey fences.

    Q1 (the 25th percentile) and Q3 (the 75th percentile) come from the
    box-plot statistics; IQR = Q3 - Q1. Values below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR are flagged as outliers. (The original docstring wrongly
    described Q1 as the median.)
    """
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    data_out = df.loc[(df[col] < lower_limit) | (df[col] > upper_limit)][:10]
    return data_out
# Show the first outlier rows for store sales.
out_lairs("store_sales(in millions)")
| food_category | food_department | food_family | store_sales(in millions) | store_cost(in millions) | unit_sales(in millions) | promotion_name | sales_country | marital_status | gender | ... | frozen_sqft | meat_sqft | coffee_bar | video_store | salad_bar | prepared_food | florist | media_type | cost | avg. yearly_income_no_dollar | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 72 | Breakfast Foods | Frozen Foods | Food | 16.200 | 5.346 | 5.000 | Two for One | USA | S | F | ... | 4923.000 | 3282.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | Daily Paper, Radio, TV | 57.520 | 50K - 70K |
| 102 | Breakfast Foods | Frozen Foods | Food | 16.200 | 6.156 | 5.000 | Weekend Markdown | USA | S | M | ... | 2452.000 | 1635.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | Daily Paper | 140.570 | 50K - 70K |
| 138 | Breakfast Foods | Frozen Foods | Food | 16.200 | 5.508 | 5.000 | Two for One | Mexico | M | M | ... | 4819.000 | 3213.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | Cash Register Handout | 123.630 | 130K - 150K |
| 171 | Breakfast Foods | Frozen Foods | Food | 16.200 | 5.508 | 5.000 | Price Slashers | Canada | S | M | ... | 4016.000 | 2678.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | Sunday Paper, Radio, TV | 77.240 | 70K - 90K |
| 371 | Bread | Baked Goods | Food | 19.650 | 6.288 | 5.000 | Dollar Cutters | USA | M | M | ... | 4746.000 | 3164.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | Daily Paper, Radio, TV | 144.180 | 30K - 50K |
| 384 | Bread | Baked Goods | Food | 18.700 | 6.919 | 5.000 | Shelf Clearing Days | USA | M | M | ... | 4746.000 | 3164.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | Radio | 99.770 | 10K - 30K |
| 407 | Bread | Baked Goods | Food | 19.300 | 7.141 | 5.000 | Two for One | USA | M | M | ... | 4923.000 | 3282.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | Daily Paper, Radio, TV | 57.520 | 30K - 50K |
| 457 | Bread | Baked Goods | Food | 18.700 | 6.358 | 5.000 | Two for One | USA | M | M | ... | 4923.000 | 3282.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | Daily Paper, Radio, TV | 57.520 | 130K - 150K |
| 673 | Bread | Baked Goods | Food | 19.650 | 8.253 | 5.000 | Money Savers | Mexico | M | M | ... | 6393.000 | 4262.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | TV | 65.700 | 130K - 150K |
| 850 | Bread | Baked Goods | Food | 18.700 | 6.545 | 5.000 | Price Slashers | Canada | M | F | ... | 4016.000 | 2678.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | Sunday Paper, Radio, TV | 77.240 | 30K - 50K |
10 rows × 41 columns
# Correlation heatmap over the numeric columns; the upper triangle is masked
# out because it mirrors the lower one.
mask = np.zeros_like(df.corr(), dtype=bool)  # fix: np.bool was removed in NumPy 1.24
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots(figsize=(35, 27))
heatmap = sns.heatmap(df.corr(),
                      mask=mask,
                      square=True,
                      linewidths=.7,
                      cmap='PuBuGn',
                      cbar_kws={'shrink': .8, "ticks": [-1, -0.5, 0, 0.5, 1]},
                      vmin=-1,
                      vmax=1,
                      annot=True,
                      annot_kws={"size": 10})
# Add the column names as labels.
ax.set_yticklabels(df.corr(), rotation=0)
ax.set_xticklabels(df.corr())
sns.set_style({'xtick.bottom': True}, {'ytick.left': True})
ax.annotate('correlation',
            fontsize=10, fontweight='bold',
            xy=(2.3, 4.2), xycoords='data',
            xytext=(0.6, 0.95), textcoords='axes fraction',
            arrowprops=dict(
                facecolor=theme[4], shrink=0.025,
                connectionstyle='arc3, rad=0.80'),
            horizontalalignment='left', verticalalignment='top'
            )
ax.set_title('Correlation of Features', size=50, color=theme[3], weight='bold', pad=40)
Text(0.5, 1.0, 'Correlation of Features')
I chose thresholds of 0.8 and -0.8, so feature pairs with a correlation greater than 0.8 or less than -0.8 are considered strongly positively or strongly negatively correlated, respectively.
meat_sqft and frozen_sqft have a perfect correlation of 1.
store_sales and store_cost also show a high correlation of 0.95, as do grocery_sqft and store_sqft at 0.91.
SRP has a high correlation with store_sales and store_cost, at 0.83 and 0.80 respectively.
In conclusion, I chose to remove salad_bar, gross_weight, avg_cars_at home(approx).1, store_sales, store_cost, meat_sqft, store_sqft.
# Shapiro-Wilk normality test on every numeric column
# (iterate the column names directly instead of range(len(...))).
numeric_data = df.select_dtypes(exclude=["object"])
numeric_feature = df.dtypes != object
final_numeric_feature = df.columns[numeric_feature].tolist()
from scipy.stats import shapiro
for col in final_numeric_feature:
    stat, p = shapiro(df[col])
    print(col)
    print('P_value=%.3f' % (p))
    print("*******************************")
store_sales(in millions) P_value=0.000 ******************************* store_cost(in millions) P_value=0.000 ******************************* unit_sales(in millions) P_value=0.000 ******************************* total_children P_value=0.000 ******************************* avg_cars_at home(approx) P_value=0.000 ******************************* num_children_at_home P_value=0.000 ******************************* avg_cars_at home(approx).1 P_value=0.000 ******************************* SRP P_value=0.000 ******************************* gross_weight P_value=0.000 ******************************* net_weight P_value=0.000 ******************************* recyclable_package P_value=0.000 ******************************* low_fat P_value=0.000 ******************************* units_per_case P_value=0.000 ******************************* store_sqft P_value=0.000 ******************************* grocery_sqft P_value=0.000 ******************************* frozen_sqft P_value=0.000 ******************************* meat_sqft P_value=0.000 ******************************* coffee_bar P_value=0.000 ******************************* video_store P_value=0.000 ******************************* salad_bar P_value=0.000 ******************************* prepared_food P_value=0.000 ******************************* florist P_value=0.000 ******************************* cost P_value=0.000 *******************************
In Shapiro-Wilk Normality Test : H0 : The data are normally distributed
According to the P_value obtained from the Shapiro test (p_value < 5%), we reject our null hypothesis (H0) --> So the numerical fields do not follow the normal distribution.
As we can see, we have skewness and outliers; let's analyze the origin of these outliers to see how we are going to handle them.
# Re-select the numeric columns and confirm the shape (rows, numeric cols).
numeric_data=df.select_dtypes(exclude=["object"])
numeric_data.shape
(60428, 23)
# Grid of KDE plots, one per numeric column, to inspect each distribution.
plt.figure(figsize=(30, 30))
for idx, col in enumerate(numeric_data.columns[:24]):
    plt.subplot(6, 4, idx + 1)
    plt.title(col)
    plotting = sns.kdeplot(x=numeric_data[col], fill=True, common_norm=False,
                           color="#E68193", alpha=.9, linewidth=3);
plt.tight_layout()
plt.title('Distribution')
plotting.figure.suptitle(' Distribution of features ', y=1.08, size=26,
                         color=theme[3], weight='bold');
We can see here that some features stored as numeric are really categorical variables, and that we also have skewed features.
H0: There are NO differences between lst_0 and lst_1, i.e. mean(lst_0) == mean(lst_1)
Ha: There ARE differences between lst_0 and lst_1, i.e. mean(lst_0) != mean(lst_1)
# Welch's t-test of mean cost between the 0-group and 1-group of each
# binary feature; H0 (equal means) is rejected when p < alpha.
from scipy.stats import ttest_ind
lst_cate = ['recyclable_package', 'low_fat', 'coffee_bar', 'video_store', 'prepared_food', 'florist']
alpha = .05
for feature in lst_cate:
    without_feature = df.cost[df[feature] == 0]
    with_feature = df.cost[df[feature] == 1]
    t, p = ttest_ind(without_feature, with_feature, equal_var=False)
    # --------------------------------------------------------------------------------
    print(f'p-value = {p:.2f}, alpha = {alpha:.2f}')
    if p < alpha:
        print('p < alpha => reject H0 => there are differences between', feature, 'and cost')
    else:
        print('p > alpha => failed to reject H0 => there are NO differences between', feature, 'and cost')
    print('\n')
p-value = 0.67, alpha = 0.05 p > alpha => failed to reject H0 => there are NO differences between recyclable_package and cost p-value = 0.30, alpha = 0.05 p > alpha => failed to reject H0 => there are NO differences between low_fat and cost p-value = 0.00, alpha = 0.05 p < alpha => reject H0 => there are differences between coffee_bar and cost p-value = 0.00, alpha = 0.05 p < alpha => reject H0 => there are differences between video_store and cost p-value = 0.00, alpha = 0.05 p < alpha => reject H0 => there are differences between prepared_food and cost p-value = 0.00, alpha = 0.05 p < alpha => reject H0 => there are differences between florist and cost
# NOTE(review): despite the name, this is store_sales minus store_cost,
# i.e. a gross-profit figure — not a unit count like unit_sales(in millions).
# Renaming would break later cells that reference "unit_sales".
df["unit_sales"]=df["store_sales(in millions)"]-df["store_cost(in millions)"]
df["unit_sales"]
0 4.637
1 2.926
2 2.318
3 2.502
4 2.652
...
60423 1.435
60424 1.104
60425 2.981
60426 5.713
60427 4.968
Name: unit_sales, Length: 60428, dtype: float64
The main goal of this notebook is to find the features that most affect the prediction of the cost of acquiring a customer via media, so I will focus on EDA, with inferential analytics where necessary, to explore the features.
#from dataprep.datasets import load_dataset
#from dataprep.eda import create_report
#report=create_report(df)
#report.save("projectcost")
#report
UNI_VARIATE_ANALYSIS
import plotly.express as px
# Donut chart of the overall total_children distribution, styled to match
# the dark notebook theme.
# NOTE(review): values='total_children' weights each slice by the value
# itself rather than by row count — confirm this weighting is intended.
division_rat = px.pie(df, names='total_children', values='total_children', hole=0.6, title='Overall total children',
color_discrete_sequence=px.colors.qualitative.T10)
division_rat.update_traces(textfont=dict(color='#fff'))
division_rat.update_layout(autosize=True, height=300, width=800,
margin=dict(t=80, b=30, l=70, r=40),
plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
font=dict(color='#8a8d93'),
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)
# Raw counts behind the chart above.
df["total_children"].value_counts()
2.000 12518 4.000 12427 3.000 11921 1.000 11770 5.000 6168 0.000 5624 Name: total_children, dtype: int64
import plotly.express as px
# Donut chart of unit_sales(in millions); same dark theme styling.
# NOTE(review): values= weights slices by the sales value itself, not by
# row count — confirm this is intended.
division_rat = px.pie(df, names='unit_sales(in millions)', values='unit_sales(in millions)', hole=0.7, title='Overall unit_sales',
color_discrete_sequence=px.colors.qualitative.T10)
division_rat.update_traces(textfont=dict(color='#fff'))
division_rat.update_layout(autosize=True, height=300, width=500,
margin=dict(t=80, b=30, l=70, r=40),
plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
font=dict(color='#8a8d93'),
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)
BIVARIATE_ANALYSIS
# Total unit sales per food category, largest first, as a colored bar chart.
category_totals = df.groupby("food_category")[["unit_sales(in millions)"]].sum()
data = category_totals.sort_values(by=['unit_sales(in millions)'], ascending=[False]).reset_index()
px.bar(data, x='food_category', y="unit_sales(in millions)", color='food_category',
       color_discrete_sequence=px.colors.qualitative.Bold)
# Mean profit ("unit_sales") per food category, best first; show the top 10.
grp_product = df.groupby('food_category')
sales_by_product = (grp_product['unit_sales']
                    .mean()
                    .reset_index()
                    .sort_values(by='unit_sales', ascending=False))
sales_by_product[:10]
| food_category | unit_sales | |
|---|---|---|
| 37 | Plastic Products | 4.839 |
| 34 | Pain Relievers | 4.698 |
| 11 | Canned Shrimp | 4.669 |
| 7 | Canned Anchovies | 4.657 |
| 19 | Drinks | 4.351 |
| 21 | Electrical | 4.320 |
| 43 | Starchy Foods | 4.243 |
| 29 | Kitchen Products | 4.225 |
| 23 | Frozen Entrees | 4.224 |
| 41 | Snack Foods | 4.135 |
# Horizontal bar chart of the 10 best categories, annotated with the values.
plt.figure(figsize=(15, 6))
top10 = sales_by_product[:10]
plt.barh(top10['food_category'], top10['unit_sales'])
plt.xticks(rotation='vertical', size=8)
plt.xlabel('Sales in millions')
plt.ylabel('Product')
plt.title('Top 10 Products With Most Sales')
for index, value in enumerate(top10['unit_sales']):
    plt.text(value, index, str(value))
plt.show()
As we can see, Pain Relievers, Plastic Products and Canned Shrimp achieve the highest net sales.
# Compare mean profit ("unit_sales") per food category across the three
# largest store types, one subplot per store type.
supermaret_store = df[df['store_type']=="Supermarket"]
duluxe_supermaret_store = df[df['store_type']=="Deluxe Supermarket"]
Gourmet_supermaret_store = df[df['store_type']=="Gourmet Supermarket"]
stores = ["Supermarket", "Deluxe Supermarket", "Gourmet Supermarket"]
fig, ax = plt.subplots(1, 3, figsize=(36, 10))
for i, store in enumerate([supermaret_store, duluxe_supermaret_store, Gourmet_supermaret_store]):
    fig.sca(ax[i])
    df_category_quantity = store.groupby('food_category')['unit_sales'].mean().reset_index()
    # BUG FIX: sort_values returns a new frame; the original discarded the
    # result, so the "[:10]" below plotted an arbitrary (alphabetical) head
    # instead of the actual top 10 categories.
    df_category_quantity = df_category_quantity.sort_values(by='unit_sales', ascending=False)
    plt.barh(df_category_quantity[:10]['food_category'], df_category_quantity[:10]['unit_sales'])
    plt.xticks(rotation='vertical', size=20)
    plt.yticks(size=20)
    plt.xlabel('Quantity')
    plt.ylabel('Category')
    plt.title(stores[i], fontsize=20, color='brown')
fig.suptitle('net salary of of each food category in Each super market', fontsize=20, color='brown', y=.95)
Text(0.5, 0.95, 'net salary of of each food category in Each super market')
In the three store types with the highest sales, the top food category is Canned Anchovies.
# Row counts per food family.
food = df['food_family'].value_counts().reset_index()
food
| index | food_family | |
|---|---|---|
| 0 | Food | 43284 |
| 1 | Non-Consumable | 11573 |
| 2 | Drink | 5571 |
# Bar (histogram) of food_family counts, largest category first.
classname2 = px.histogram(df, x='food_family',
title='COUNT OF FOOD_FAMILY', height=300,width=600
,color_discrete_sequence=['#03DAC5'],
)
classname2.update_yaxes(showgrid=False),
classname2.update_xaxes(categoryorder='total descending')
classname2.update_traces(hovertemplate=None)
# Dark-theme layout to match the rest of the notebook's figures.
classname2.update_layout(margin=dict(t=100, b=0, l=70, r=40),
hovermode="x unified",
xaxis_tickangle=360,
xaxis_title=' ', yaxis_title=" ",
plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
font=dict(color='#8a8d93'),
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)
As we can see, the largest food family is Food, because most of the items the food mart sells are food.
# food_family counts split by marital status (S vs M).
classname = px.histogram(df, x='food_family', color='marital_status',
title='FOOD_FAMILY BASED ON MARITAL STATUS',height=300,width=600,
category_orders={'marital_status': ['S', 'M']},
color_discrete_sequence=['#DB6574', '#03DAC5'],
)
classname.update_yaxes(showgrid=False),
classname.update_xaxes(categoryorder='total descending')
classname.update_traces(hovertemplate=None)
# Dark-theme layout to match the rest of the notebook's figures.
classname.update_layout(margin=dict(t=100, b=0, l=70, r=40),
hovermode="x unified",
xaxis_tickangle=360,
xaxis_title=' ', yaxis_title=" ",
plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
font=dict(color='#8a8d93'),
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)
In Food: married customers buy less than single ones.
In Non-Consumable: married customers buy more than single ones.
In Drink: married customers buy more than single ones.
# Total profit ("unit_sales") per country, then shown as a pie chart.
top_region = df.groupby('sales_country')['unit_sales'].sum().sort_values(ascending=False).head(10)
top_region = top_region.reset_index()
import plotly.graph_objects as go
# NOTE(review): pull has 4 entries but only 3 countries exist (USA, Mexico,
# Canada); the extra value is ignored by plotly.
fig = go.Figure(go.Pie(labels=top_region.sales_country,
values = top_region.unit_sales,
customdata = top_region.unit_sales,
hovertemplate = "Region:%{label} <br> Sales: %{customdata} <extra></extra>",
pull= [0.1,0,0,0.1],
))
fig.update_layout(
title={
'text': "<b>Sales in different Regions",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
)
fig.show()
As we can see above, the USA has the highest net sales among the countries because it has the largest number of stores.
# Country slices: USA = tier 1, Mexico = tier 2, Canada = tier 3.
df_tier1 = pd.DataFrame(df.loc[df['sales_country'] == 'USA'])     # tier 1
df_tier2 = pd.DataFrame(df.loc[df['sales_country'] == 'Mexico'])  # tier 2
df_tier3 = pd.DataFrame(df.loc[df['sales_country'] == 'Canada'])  # tier 3
# df_cities = pd.concat([df_tier2, df_tier3], axis=0)  # combining dfs for tier 2 & 3
# Mean cost per food department for the USA, highest first.
df_tier1_list = (
    df_tier1.groupby("food_department")[["cost"]]
    .mean()
    .sort_values(by=['cost'], ascending=[False])
    .reset_index()
)
df_tier1_list.rename(
    columns={'food_department': 'Items_Tier_1', 'cost': 'Sales_of_usa'},
    inplace=True,
)
df_tier1_list
| Items_Tier_1 | Sales_of_usa | |
|---|---|---|
| 0 | Checkout | 101.306 |
| 1 | Breakfast Foods | 99.904 |
| 2 | Canned Foods | 99.501 |
| 3 | Meat | 99.470 |
| 4 | Deli | 99.273 |
| 5 | Carousel | 99.064 |
| 6 | Snack Foods | 98.970 |
| 7 | Baked Goods | 98.867 |
| 8 | Eggs | 98.854 |
| 9 | Frozen Foods | 98.837 |
| 10 | Starchy Foods | 98.732 |
| 11 | Seafood | 98.709 |
| 12 | Baking Goods | 98.463 |
| 13 | Dairy | 98.454 |
| 14 | Canned Products | 98.154 |
| 15 | Alcoholic Beverages | 98.097 |
| 16 | Health and Hygiene | 97.798 |
| 17 | Beverages | 97.792 |
| 18 | Produce | 97.749 |
| 19 | Household | 97.711 |
| 20 | Snacks | 96.917 |
| 21 | Periodicals | 96.263 |
# Mean cost per food department for Mexico, highest first.
df_tier2_list = (
    df_tier2.groupby("food_department")[["cost"]]
    .mean()
    .sort_values(by=['cost'], ascending=[False])
    .reset_index()
)
df_tier2_list.rename(
    columns={'food_department': 'Items_Tier2', 'cost': 'Sales_of_Mexico'},
    inplace=True,
)
df_tier2_list
| Items_Tier2 | Sales_of_Mexico | |
|---|---|---|
| 0 | Periodicals | 102.395 |
| 1 | Canned Foods | 102.146 |
| 2 | Beverages | 101.972 |
| 3 | Baking Goods | 101.732 |
| 4 | Eggs | 101.351 |
| 5 | Health and Hygiene | 101.145 |
| 6 | Household | 101.129 |
| 7 | Snack Foods | 100.796 |
| 8 | Frozen Foods | 100.785 |
| 9 | Alcoholic Beverages | 100.699 |
| 10 | Snacks | 100.682 |
| 11 | Deli | 100.627 |
| 12 | Seafood | 100.572 |
| 13 | Carousel | 100.197 |
| 14 | Starchy Foods | 100.112 |
| 15 | Produce | 100.029 |
| 16 | Baked Goods | 99.903 |
| 17 | Canned Products | 99.871 |
| 18 | Dairy | 99.188 |
| 19 | Meat | 99.084 |
| 20 | Breakfast Foods | 98.168 |
| 21 | Checkout | 97.591 |
# Mean cost per food department for Canada, highest first.
df_tier3_list = (
    df_tier3.groupby("food_department")[["cost"]]
    .mean()
    .sort_values(by=['cost'], ascending=[False])
    .reset_index()
)
df_tier3_list.rename(
    columns={'food_department': 'Items_Tier3', 'cost': 'Sales_of_canda'},
    inplace=True,
)
df_tier3_list
| Items_Tier3 | Sales_of_canda | |
|---|---|---|
| 0 | Snacks | 109.589 |
| 1 | Periodicals | 105.818 |
| 2 | Canned Products | 104.617 |
| 3 | Health and Hygiene | 103.927 |
| 4 | Baked Goods | 103.757 |
| 5 | Breakfast Foods | 102.233 |
| 6 | Produce | 101.861 |
| 7 | Snack Foods | 101.245 |
| 8 | Frozen Foods | 101.232 |
| 9 | Deli | 101.201 |
| 10 | Checkout | 101.075 |
| 11 | Eggs | 100.990 |
| 12 | Alcoholic Beverages | 100.619 |
| 13 | Household | 99.459 |
| 14 | Beverages | 99.355 |
| 15 | Dairy | 99.327 |
| 16 | Baking Goods | 99.213 |
| 17 | Starchy Foods | 98.701 |
| 18 | Canned Foods | 98.652 |
| 19 | Carousel | 98.185 |
| 20 | Meat | 96.982 |
| 21 | Seafood | 95.221 |
# Side-by-side view of the three per-country rankings (column-wise concat).
df_sales = pd.concat(
    [df_tier1_list, df_tier2_list, df_tier3_list],
    axis=1,
)
df_sales
| Items_Tier_1 | Sales_of_usa | Items_Tier2 | Sales_of_Mexico | Items_Tier3 | Sales_of_canda | |
|---|---|---|---|---|---|---|
| 0 | Checkout | 101.306 | Periodicals | 102.395 | Snacks | 109.589 |
| 1 | Breakfast Foods | 99.904 | Canned Foods | 102.146 | Periodicals | 105.818 |
| 2 | Canned Foods | 99.501 | Beverages | 101.972 | Canned Products | 104.617 |
| 3 | Meat | 99.470 | Baking Goods | 101.732 | Health and Hygiene | 103.927 |
| 4 | Deli | 99.273 | Eggs | 101.351 | Baked Goods | 103.757 |
| 5 | Carousel | 99.064 | Health and Hygiene | 101.145 | Breakfast Foods | 102.233 |
| 6 | Snack Foods | 98.970 | Household | 101.129 | Produce | 101.861 |
| 7 | Baked Goods | 98.867 | Snack Foods | 100.796 | Snack Foods | 101.245 |
| 8 | Eggs | 98.854 | Frozen Foods | 100.785 | Frozen Foods | 101.232 |
| 9 | Frozen Foods | 98.837 | Alcoholic Beverages | 100.699 | Deli | 101.201 |
| 10 | Starchy Foods | 98.732 | Snacks | 100.682 | Checkout | 101.075 |
| 11 | Seafood | 98.709 | Deli | 100.627 | Eggs | 100.990 |
| 12 | Baking Goods | 98.463 | Seafood | 100.572 | Alcoholic Beverages | 100.619 |
| 13 | Dairy | 98.454 | Carousel | 100.197 | Household | 99.459 |
| 14 | Canned Products | 98.154 | Starchy Foods | 100.112 | Beverages | 99.355 |
| 15 | Alcoholic Beverages | 98.097 | Produce | 100.029 | Dairy | 99.327 |
| 16 | Health and Hygiene | 97.798 | Baked Goods | 99.903 | Baking Goods | 99.213 |
| 17 | Beverages | 97.792 | Canned Products | 99.871 | Starchy Foods | 98.701 |
| 18 | Produce | 97.749 | Dairy | 99.188 | Canned Foods | 98.652 |
| 19 | Household | 97.711 | Meat | 99.084 | Carousel | 98.185 |
| 20 | Snacks | 96.917 | Breakfast Foods | 98.168 | Meat | 96.982 |
| 21 | Periodicals | 96.263 | Checkout | 97.591 | Seafood | 95.221 |
import plotly.graph_objects as go  # importing the library

# One line per country: mean cost by department (x ordered by the USA ranking).
x = df_sales['Items_Tier_1']
y1 = df_sales['Sales_of_usa']
y2 = df_sales['Sales_of_Mexico']
y3 = df_sales['Sales_of_canda']

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=x, y=y1, mode='lines+markers',
               line_color='rgb(0,100,80)', name='Sales_of_usa')
)
fig.add_trace(
    go.Scatter(x=x, y=y2, mode='lines+markers', name='Sales_of_Mexico')
)
fig.add_trace(
    go.Scatter(x=x, y=y3, mode='lines+markers', name='Sales_of_canda')
)
fig.show()
As we can see above, Canada has the highest cost for almost every food department, except for Dairy, where it begins to decrease.
Canada also has the lowest values compared to the other countries in a few departments, with Snacks being a notable exception.
Mexico sits mostly in the middle, except for Baking Goods, and it reaches the highest values for Periodicals.
# Mean unit sales per food department for the USA, highest first.
df_tier1_list = (
    df_tier1.groupby("food_department")[["unit_sales"]]
    .mean()
    .sort_values(by=['unit_sales'], ascending=[False])
    .reset_index()
)
df_tier1_list.rename(
    columns={'food_department': 'Items_Tier_1', 'unit_sales': 'Sales_of_usa'},
    inplace=True,
)
df_tier1_list
| Items_Tier_1 | Sales_of_usa | |
|---|---|---|
| 0 | Starchy Foods | 4.325 |
| 1 | Snack Foods | 4.125 |
| 2 | Household | 4.102 |
| 3 | Seafood | 4.029 |
| 4 | Dairy | 4.025 |
| 5 | Produce | 3.994 |
| 6 | Eggs | 3.969 |
| 7 | Checkout | 3.969 |
| 8 | Baked Goods | 3.946 |
| 9 | Breakfast Foods | 3.937 |
| 10 | Snacks | 3.917 |
| 11 | Meat | 3.912 |
| 12 | Deli | 3.904 |
| 13 | Canned Foods | 3.802 |
| 14 | Frozen Foods | 3.794 |
| 15 | Alcoholic Beverages | 3.793 |
| 16 | Periodicals | 3.772 |
| 17 | Health and Hygiene | 3.768 |
| 18 | Beverages | 3.740 |
| 19 | Baking Goods | 3.537 |
| 20 | Carousel | 3.517 |
| 21 | Canned Products | 3.324 |
# Mean unit sales per food department for Mexico, highest first.
df_tier2_list = (
    df_tier2.groupby("food_department")[["unit_sales"]]
    .mean()
    .sort_values(by=['unit_sales'], ascending=[False])
    .reset_index()
)
df_tier2_list.rename(
    columns={'food_department': 'Items_Tier2', 'unit_sales': 'Sales_of_Mexico'},
    inplace=True,
)
df_tier2_list
| Items_Tier2 | Sales_of_Mexico | |
|---|---|---|
| 0 | Snack Foods | 4.164 |
| 1 | Household | 4.162 |
| 2 | Produce | 4.041 |
| 3 | Dairy | 4.025 |
| 4 | Seafood | 4.013 |
| 5 | Starchy Foods | 3.978 |
| 6 | Snacks | 3.972 |
| 7 | Eggs | 3.961 |
| 8 | Periodicals | 3.939 |
| 9 | Baked Goods | 3.925 |
| 10 | Beverages | 3.885 |
| 11 | Canned Foods | 3.873 |
| 12 | Frozen Foods | 3.865 |
| 13 | Alcoholic Beverages | 3.849 |
| 14 | Deli | 3.840 |
| 15 | Meat | 3.835 |
| 16 | Checkout | 3.772 |
| 17 | Breakfast Foods | 3.762 |
| 18 | Health and Hygiene | 3.708 |
| 19 | Baking Goods | 3.596 |
| 20 | Canned Products | 3.483 |
| 21 | Carousel | 3.327 |
# Mean unit sales per food department for Canada, highest first.
df_tier3_list = (
    df_tier3.groupby("food_department")[["unit_sales"]]
    .mean()
    .sort_values(by=['unit_sales'], ascending=[False])
    .reset_index()
)
# BUG FIX: the original renamed this column to 'Items_Tier2' (copy-paste from
# the Mexico cell), producing a duplicate column name after the later concat.
df_tier3_list.rename(
    columns={'food_department': 'Items_Tier3', 'unit_sales': 'Sales_of_canada'},
    inplace=True,
)
df_tier3_list
| Items_Tier2 | Sales_of_canada | |
|---|---|---|
| 0 | Eggs | 5.074 |
| 1 | Starchy Foods | 4.527 |
| 2 | Snacks | 4.261 |
| 3 | Household | 4.235 |
| 4 | Meat | 4.166 |
| 5 | Snack Foods | 4.108 |
| 6 | Deli | 4.013 |
| 7 | Baked Goods | 3.996 |
| 8 | Produce | 3.959 |
| 9 | Breakfast Foods | 3.927 |
| 10 | Checkout | 3.899 |
| 11 | Frozen Foods | 3.898 |
| 12 | Periodicals | 3.892 |
| 13 | Canned Foods | 3.836 |
| 14 | Beverages | 3.815 |
| 15 | Health and Hygiene | 3.798 |
| 16 | Baking Goods | 3.796 |
| 17 | Dairy | 3.780 |
| 18 | Alcoholic Beverages | 3.693 |
| 19 | Canned Products | 3.526 |
| 20 | Carousel | 3.381 |
| 21 | Seafood | 3.082 |
# Side-by-side view of the three per-country unit-sales rankings.
df_sales = pd.concat(
    [df_tier1_list, df_tier2_list, df_tier3_list],
    axis=1,
)
df_sales
| Items_Tier_1 | Sales_of_usa | Items_Tier2 | Sales_of_Mexico | Items_Tier2 | Sales_of_canada | |
|---|---|---|---|---|---|---|
| 0 | Starchy Foods | 4.325 | Snack Foods | 4.164 | Eggs | 5.074 |
| 1 | Snack Foods | 4.125 | Household | 4.162 | Starchy Foods | 4.527 |
| 2 | Household | 4.102 | Produce | 4.041 | Snacks | 4.261 |
| 3 | Seafood | 4.029 | Dairy | 4.025 | Household | 4.235 |
| 4 | Dairy | 4.025 | Seafood | 4.013 | Meat | 4.166 |
| 5 | Produce | 3.994 | Starchy Foods | 3.978 | Snack Foods | 4.108 |
| 6 | Eggs | 3.969 | Snacks | 3.972 | Deli | 4.013 |
| 7 | Checkout | 3.969 | Eggs | 3.961 | Baked Goods | 3.996 |
| 8 | Baked Goods | 3.946 | Periodicals | 3.939 | Produce | 3.959 |
| 9 | Breakfast Foods | 3.937 | Baked Goods | 3.925 | Breakfast Foods | 3.927 |
| 10 | Snacks | 3.917 | Beverages | 3.885 | Checkout | 3.899 |
| 11 | Meat | 3.912 | Canned Foods | 3.873 | Frozen Foods | 3.898 |
| 12 | Deli | 3.904 | Frozen Foods | 3.865 | Periodicals | 3.892 |
| 13 | Canned Foods | 3.802 | Alcoholic Beverages | 3.849 | Canned Foods | 3.836 |
| 14 | Frozen Foods | 3.794 | Deli | 3.840 | Beverages | 3.815 |
| 15 | Alcoholic Beverages | 3.793 | Meat | 3.835 | Health and Hygiene | 3.798 |
| 16 | Periodicals | 3.772 | Checkout | 3.772 | Baking Goods | 3.796 |
| 17 | Health and Hygiene | 3.768 | Breakfast Foods | 3.762 | Dairy | 3.780 |
| 18 | Beverages | 3.740 | Health and Hygiene | 3.708 | Alcoholic Beverages | 3.693 |
| 19 | Baking Goods | 3.537 | Baking Goods | 3.596 | Canned Products | 3.526 |
| 20 | Carousel | 3.517 | Canned Products | 3.483 | Carousel | 3.381 |
| 21 | Canned Products | 3.324 | Carousel | 3.327 | Seafood | 3.082 |
import plotly.graph_objects as go  # importing the library

# One line per country: mean unit sales by department (x ordered by the USA ranking).
x = df_sales['Items_Tier_1']
y1 = df_sales['Sales_of_usa']
y2 = df_sales['Sales_of_Mexico']
y3 = df_sales['Sales_of_canada']

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=x, y=y1, mode='lines+markers',
               line_color='rgb(0,100,80)', name='Sales_of_usa')
)
fig.add_trace(
    go.Scatter(x=x, y=y2, mode='lines+markers', name='Sales_of_Mexico')
)
# BUG FIX: legend label was misspelled 'Sales_of_canda' while the data
# column is 'Sales_of_canada'.
fig.add_trace(
    go.Scatter(x=x, y=y3, mode='lines+markers', name='Sales_of_canada')
)
fig.show()
As we can see above, Canada has the highest unit sales for the Starchy Foods and Snack Foods departments, but drops off for the Canned Products department.
# Top 10 cities by mean unit sales, drawn as a pie chart.
top_region = (
    df.groupby('store_city')['unit_sales']
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
fig = go.Figure(
    go.Pie(
        labels=top_region.store_city,
        values=top_region.unit_sales,
        customdata=top_region.unit_sales,
        hovertemplate="Region:%{label} <br> Sales: %{customdata} <extra></extra>",
        pull=[0.1, 0, 0, 0.1],  # pull the 1st and 4th slices out slightly
    )
)
fig.update_layout(
    title={
        'text': "<b>Sales in different Regions",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
)
fig.show()
The two highest cities by net sales are Portland and Merida.
# City slices: Tacoma = tier 1; Portland and Seattle combined = tiers 2 & 3.
df_tier1 = pd.DataFrame(df.loc[df['store_city'] == 'Tacoma'])    # for tier 1
df_tier2 = pd.DataFrame(df.loc[df['store_city'] == 'Portland'])  # for tier 2
df_tier3 = pd.DataFrame(df.loc[df['store_city'] == 'Seattle'])   # for tier 3
df_cities = pd.concat([df_tier2, df_tier3], axis=0)  # combining dfs for tier 2 & 3

# Total unit sales per department for each group, highest first.
df_tier1_list = (
    df_tier1.groupby("food_department")[["unit_sales"]]
    .sum()
    .sort_values(by=['unit_sales'], ascending=[False])
    .reset_index()
)
df_tier1_list.rename(
    columns={'food_department': 'Items_Tier_1', 'unit_sales': 'Sales_tier_1'},
    inplace=True,
)
df_cities_list = (
    df_cities.groupby("food_department")[["unit_sales"]]
    .sum()
    .sort_values(by=['unit_sales'], ascending=[False])
    .reset_index()
)
df_cities_list.rename(
    columns={'food_department': 'Items_Tier2_3', 'unit_sales': 'Sales_tier2_3'},
    inplace=True,
)
df3_sales = pd.concat([df_tier1_list, df_cities_list], axis=1)
df3_sales
| Items_Tier_1 | Sales_tier_1 | Items_Tier2_3 | Sales_tier2_3 | |
|---|---|---|---|---|
| 0 | Produce | 3310.340 | Produce | 5848.674 |
| 1 | Snack Foods | 2588.700 | Snack Foods | 5080.282 |
| 2 | Household | 2424.999 | Household | 4384.266 |
| 3 | Frozen Foods | 2351.870 | Frozen Foods | 4028.983 |
| 4 | Baking Goods | 1607.119 | Canned Foods | 2729.122 |
| 5 | Canned Foods | 1551.089 | Dairy | 2726.800 |
| 6 | Dairy | 1473.343 | Baking Goods | 2671.489 |
| 7 | Health and Hygiene | 1410.649 | Health and Hygiene | 2550.390 |
| 8 | Beverages | 1120.764 | Deli | 1998.781 |
| 9 | Deli | 976.781 | Beverages | 1756.473 |
| 10 | Baked Goods | 708.268 | Alcoholic Beverages | 1220.921 |
| 11 | Snacks | 618.862 | Baked Goods | 1169.418 |
| 12 | Alcoholic Beverages | 533.222 | Snacks | 939.734 |
| 13 | Starchy Foods | 413.745 | Starchy Foods | 914.263 |
| 14 | Eggs | 374.990 | Eggs | 738.982 |
| 15 | Periodicals | 374.792 | Breakfast Foods | 557.957 |
| 16 | Breakfast Foods | 266.940 | Periodicals | 545.851 |
| 17 | Seafood | 141.617 | Seafood | 278.238 |
| 18 | Checkout | 132.766 | Meat | 257.669 |
| 19 | Canned Products | 125.018 | Canned Products | 227.515 |
| 20 | Meat | 115.076 | Checkout | 219.303 |
| 21 | Carousel | 67.462 | Carousel | 152.418 |
import plotly.graph_objects as go  # importing the library

# Tacoma vs. Portland+Seattle: total unit sales per food department.
x = df3_sales['Items_Tier_1']
y1 = df3_sales['Sales_tier_1']
y2 = df3_sales['Sales_tier2_3']

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=x, y=y1, mode='lines+markers',
               line_color='rgb(0,100,80)', name='Tacoma sales')
)
fig.add_trace(
    go.Scatter(x=x, y=y2, mode='lines+markers',
               name='Portland AND Seattle sales')
)
fig.show()
# Same comparison as above but on total cost instead of unit sales.
df_tier1_list = (
    df_tier1.groupby("food_department")[["cost"]]
    .sum()
    .sort_values(by=['cost'], ascending=[False])
    .reset_index()
)
df_tier1_list.rename(
    columns={'food_department': 'Items_Tier_1', 'cost': 'Sales_tier_1'},
    inplace=True,
)
df_cities_list = (
    df_cities.groupby("food_department")[["cost"]]
    .sum()
    .sort_values(by=['cost'], ascending=[False])
    .reset_index()
)
df_cities_list.rename(
    columns={'food_department': 'Items_Tier2_3', 'cost': 'Sales_tier2_3'},
    inplace=True,
)
df_sales = pd.concat([df_tier1_list, df_cities_list], axis=1)
df_sales
| Items_Tier_1 | Sales_tier_1 | Items_Tier2_3 | Sales_tier2_3 | |
|---|---|---|---|---|
| 0 | Produce | 78877.630 | Produce | 148866.540 |
| 1 | Frozen Foods | 60671.430 | Snack Foods | 124831.490 |
| 2 | Snack Foods | 60426.050 | Frozen Foods | 111532.940 |
| 3 | Household | 55489.220 | Household | 107777.820 |
| 4 | Baking Goods | 44282.790 | Baking Goods | 76650.790 |
| 5 | Canned Foods | 39356.360 | Canned Foods | 76555.460 |
| 6 | Dairy | 36122.980 | Health and Hygiene | 66211.650 |
| 7 | Health and Hygiene | 35746.410 | Dairy | 66183.500 |
| 8 | Beverages | 28394.320 | Deli | 50831.340 |
| 9 | Deli | 25747.330 | Beverages | 49155.040 |
| 10 | Baked Goods | 17832.200 | Alcoholic Beverages | 31319.450 |
| 11 | Snacks | 14829.280 | Baked Goods | 31018.590 |
| 12 | Alcoholic Beverages | 13648.780 | Snacks | 24451.440 |
| 13 | Periodicals | 9489.290 | Starchy Foods | 20801.030 |
| 14 | Starchy Foods | 9416.340 | Eggs | 18162.190 |
| 15 | Eggs | 8808.910 | Periodicals | 14483.540 |
| 16 | Breakfast Foods | 7266.300 | Breakfast Foods | 14237.040 |
| 17 | Canned Products | 3780.770 | Seafood | 7373.220 |
| 18 | Seafood | 3407.390 | Meat | 6568.490 |
| 19 | Meat | 3298.440 | Canned Products | 6476.770 |
| 20 | Checkout | 3121.800 | Checkout | 6150.020 |
| 21 | Carousel | 1533.160 | Carousel | 4710.220 |
import plotly.graph_objects as go  # importing the library

# Tacoma vs. Portland+Seattle: total cost per food department.
x = df_sales['Items_Tier_1']
y1 = df_sales['Sales_tier_1']
y2 = df_sales['Sales_tier2_3']

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=x, y=y1, mode='lines+markers',
               line_color='rgb(0,100,80)', name='Tacoma sales')
)
fig.add_trace(
    go.Scatter(x=x, y=y2, mode='lines+markers',
               name='Portland AND Seattle sales')
)
fig.show()
# Mean unit sales by store type, highest first, as a colored bar chart.
store_type_relation = (
    df.groupby("store_type")[["unit_sales"]]
    .mean()
    .sort_values(by=['unit_sales'], ascending=[False])
    .reset_index()
)
px.bar(
    store_type_relation,
    x='store_type',
    y="unit_sales",
    color='store_type',
    color_discrete_sequence=px.colors.qualitative.Pastel1,
)
duluxe_supermaret_store["promotion_name"].value_counts()
Cash Register Lottery 1555 Price Savers 1274 Double Down Sale 1158 Super Duper Savers 1090 Bag Stuffers 1009 Price Winners 975 Dimes Off 926 One Day Sale 870 Money Savers 812 I Cant Believe It Sale 780 Price Slashers 762 Green Light Days 723 Big Promo 682 Super Savers 639 Two Day Sale 626 Save-It Sale 620 Tip Top Savings 616 Sales Galore 578 Free For All 560 Sale Winners 532 Two for One 518 Shelf Clearing Days 506 Go For It 460 Big Time Savings 438 Pick Your Savings 424 Three for One 405 Dollar Days 404 Big Time Discounts 391 Price Cutters 282 Price Smashers 278 Bye Bye Baby 263 Price Destroyers 258 Saving Days 257 High Roller Savings 228 Dollar Cutters 223 Weekend Markdown 220 Sales Days 220 Green Light Special 200 Best Savings 192 Name: promotion_name, dtype: int64
# Top-10 promotions by total unit sales, one subplot per large store type.
supermaret_store = df[df['store_type']=="Supermarket"]
duluxe_supermaret_store = df[df['store_type']=="Deluxe Supermarket"]
Gourmet_supermaret_store = df[df['store_type']=="Gourmet Supermarket"]
#Mid_Size_Grocery_store = data[data['store_type']=="Mid-Size Grocery"]
#Small_Grocery_store = data[data['store_type']=="Small Grocery"]
stores=["Supermarket","Deluxe Supermarket","Gourmet Supermarket"]
fig, ax = plt.subplots(1,3, figsize=(30,10))
for i,store in enumerate([supermaret_store , duluxe_supermaret_store, Gourmet_supermaret_store]):
    fig.sca(ax[i])
    df_category_quantity = store.groupby('promotion_name')['unit_sales'].sum().reset_index()
    # BUG FIX: sort_values returns a new frame; the result was discarded, so
    # the [:10] slice previously took the first 10 promotions alphabetically
    # instead of the 10 with the highest sales.
    df_category_quantity = df_category_quantity.sort_values(by='unit_sales', ascending=False)
    plt.barh(df_category_quantity[:10]['promotion_name'], df_category_quantity[:10]['unit_sales'])
    plt.xticks(rotation='vertical', size=20)
    plt.yticks(size=20)
    plt.xlabel('Quantity')
    plt.ylabel('Category')
    plt.title(stores[i], fontsize=18,color='brown')
# FIX: corrected "of of"/"poromotion" typos in the figure title.
fig.suptitle('unit sales of each promotion in each supermarket type',fontsize=25, color='brown', y=.98)
Text(0.5, 0.98, 'net salary of of each poromotion in Each super market')
# How many Supermarket-type rows come from each city.
data = df[df["store_type"] == "Supermarket"]
data["store_city"].value_counts()
Portland 5150 Seattle 5051 Spokane 4453 Los Angeles 3960 Bremerton 3451 Orizaba 2621 Acapulco 1506 Name: store_city, dtype: int64
df["store_type"].value_counts()
Supermarket 26192 Deluxe Supermarket 22954 Gourmet Supermarket 6503 Mid-Size Grocery 2846 Small Grocery 1933 Name: store_type, dtype: int64
# Top-10 promotions by total unit sales across all stores, with value labels.
df_category_quantity = df.groupby('promotion_name')['unit_sales'].sum().reset_index()
# BUG FIX: the sorted result was discarded, so the [:10] slice previously
# showed the first 10 promotions alphabetically rather than the top 10.
df_category_quantity = df_category_quantity.sort_values(by='unit_sales', ascending=False)
plt.figure(figsize=(12,6))
plt.barh(df_category_quantity[:10]['promotion_name'], df_category_quantity[:10]['unit_sales'])
plt.xticks(rotation='vertical', size=8)
plt.xlabel('Quantity')
plt.ylabel('promotion')  # FIX: typo 'poromotion'
plt.title('the highest promotion based on sales')
# Annotate each bar with its exact value.
for index, value in enumerate(df_category_quantity[:10]['unit_sales']):
    plt.text(value, index,
             str(value))
plt.show()
# Top-10 food departments by total unit sales, one subplot per large store type.
supermaret_store = df[df['store_type']=="Supermarket"]
duluxe_supermaret_store = df[df['store_type']=="Deluxe Supermarket"]
Gourmet_supermaret_store = df[df['store_type']=="Gourmet Supermarket"]
#Mid_Size_Grocery_store = data[data['store_type']=="Mid-Size Grocery"]
#Small_Grocery_store = data[data['store_type']=="Small Grocery"]
stores=["Supermarket","Deluxe Supermarket","Gourmet Supermarket"]
fig, ax = plt.subplots(1,3, figsize=(36,10))
for i,store in enumerate([supermaret_store , duluxe_supermaret_store, Gourmet_supermaret_store]):
    fig.sca(ax[i])
    df_category_quantity = store.groupby('food_department')['unit_sales'].sum().reset_index()
    # BUG FIX: sort_values returns a new frame; the result was discarded, so
    # the [:10] slice previously took the first 10 departments alphabetically
    # instead of the 10 with the highest sales.
    df_category_quantity = df_category_quantity.sort_values(by='unit_sales', ascending=False)
    plt.barh(df_category_quantity[:10]['food_department'], df_category_quantity[:10]['unit_sales'])
    plt.xticks(rotation='vertical', size=20)
    plt.yticks(size=20)
    plt.xlabel('Quantity')
    plt.ylabel('Category')
    plt.title(stores[i], fontsize=20,color='brown')
# FIX: corrected "of of" typo in the figure title.
fig.suptitle('unit sales of each food department in each supermarket type',fontsize=20, color='brown', y=.95)
# Total unit sales by food department, highest first, as a colored bar chart.
food_department_relation = (
    df.groupby("food_department")[["unit_sales"]]
    .sum()
    .sort_values(by=['unit_sales'], ascending=[False])
    .reset_index()
)
px.bar(
    food_department_relation,
    x='food_department',
    y="unit_sales",
    color='food_department',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
The best five departments are Produce, Snack Foods, Household, Frozen Foods and Canned Foods.
member_card with unit_sales
# Total unit sales per member-card tier, with value labels on each bar.
df_category_quantity = df.groupby('member_card')['unit_sales'].sum().reset_index()
# BUG FIX: the sorted result was discarded, so the bars were previously
# in alphabetical order rather than by sales.
df_category_quantity = df_category_quantity.sort_values(by='unit_sales', ascending=False)
plt.figure(figsize=(12,6))
plt.barh(df_category_quantity[:10]['member_card'], df_category_quantity[:10]['unit_sales'])
plt.xticks(rotation='vertical', size=8)
plt.xlabel('Quantity')
plt.ylabel('Category')
plt.title('Number of Quantities Sold in Each Category')
# Annotate each bar with its exact value.
for index, value in enumerate(df_category_quantity[:10]['unit_sales']):
    plt.text(value, index,
             str(value))
plt.show()
# Total unit_sales(in millions) per food department, with value labels.
df_category_quantity = df.groupby('food_department')['unit_sales(in millions)'].sum().reset_index()
# BUG FIX: the sorted result was discarded, so the [:10] slice previously
# took the first 10 departments alphabetically instead of the top 10.
df_category_quantity = df_category_quantity.sort_values(by='unit_sales(in millions)', ascending=False)
plt.figure(figsize=(12,6))
plt.barh(df_category_quantity[:10]['food_department'], df_category_quantity[:10]['unit_sales(in millions)'])
plt.xticks(rotation='vertical', size=8)
plt.xlabel('Quantity')
plt.ylabel('Category')
plt.title('Number of Quantities Sold in Each Category')
# Annotate each bar with its exact value.
for index, value in enumerate(df_category_quantity[:10]['unit_sales(in millions)']):
    plt.text(value, index,
             str(value))
plt.show()
The Baking Goods department achieves the highest unit sales across all departments.
# Ten food categories with the largest single store sale (max per category).
most_SALARY = (
    df.groupby(["food_category"])["store_sales(in millions)"]
    .max()
    .sort_values(ascending=False)[:10]
)
most_SALARY
food_category Fruit 22.920 Canned Soup 19.900 Pain Relievers 19.900 Dairy 19.900 Vegetables 19.850 Hygiene 19.850 Snack Foods 19.800 Jams and Jellies 19.800 Kitchen Products 19.750 Drinks 19.750 Name: store_sales(in millions), dtype: float64
location and sales
# Show floats with 3 decimal places in pandas output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Total cost per member-card tier, sorted descending for display.
outlet_size_sales = df.groupby("member_card")[["cost"]].sum().reset_index()
outlet_size_sales.sort_values(by=['cost'], ascending=[False])

# Bar chart: total unit sales per member-card tier.
data = (
    df.groupby("member_card")[["unit_sales"]]
    .sum()
    .sort_values(by=['unit_sales'], ascending=[False])
    .reset_index()
)
px.bar(
    data,
    x='member_card',
    y="unit_sales",
    color='member_card',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
yearly income with net_salary?
# Total unit sales per yearly-income bracket, with value labels on each bar.
df_category_quantity = df.groupby('avg. yearly_income')['unit_sales'].sum().reset_index()
# BUG FIX: the sorted result was discarded, so the bars were previously in
# alphabetical bracket order rather than by sales.
df_category_quantity = df_category_quantity.sort_values(by='unit_sales', ascending=False)
plt.figure(figsize=(12,6))
plt.barh(df_category_quantity[:10]['avg. yearly_income'], df_category_quantity[:10]['unit_sales'])
plt.xticks(rotation='vertical', size=8)
plt.xlabel('Quantity')
plt.ylabel('Category')
plt.title('Number of Quantities Sold in Each Category')
# Annotate each bar with its exact value.
for index, value in enumerate(df_category_quantity[:10]['unit_sales']):
    plt.text(value, index,
             str(value))
plt.show()
# Mean cost per media type (note: this chart is repeated again below).
store_type_relation = (
    df.groupby("media_type")[["cost"]]
    .mean()
    .sort_values(by=['cost'], ascending=[False])
    .reset_index()
)
px.bar(store_type_relation, x='media_type', y="cost",
       color='media_type', color_discrete_sequence=px.colors.qualitative.Pastel1)

# Mean unit sales per media type.
store_type_relation = (
    df.groupby("media_type")[["unit_sales"]]
    .mean()
    .sort_values(by=['unit_sales'], ascending=[False])
    .reset_index()
)
px.bar(store_type_relation, x='media_type', y="unit_sales",
       color='media_type', color_discrete_sequence=px.colors.qualitative.Pastel1)

# Mean cost per media type (duplicate of the first chart, kept as in source).
store_type_relation = (
    df.groupby("media_type")[["cost"]]
    .mean()
    .sort_values(by=['cost'], ascending=[False])
    .reset_index()
)
px.bar(store_type_relation, x='media_type', y="cost",
       color='media_type', color_discrete_sequence=px.colors.qualitative.Pastel1)
# Mean advertising cost per media type, one subplot per country.
usa_store = df[df['sales_country']=="USA"]
Mexico_store = df[df['sales_country']=="Mexico"]
Canada_store = df[df['sales_country']=="Canada"]
#Mid_Size_Grocery_store = data[data['store_type']=="Mid-Size Grocery"]
#Small_Grocery_store = data[data['store_type']=="Small Grocery"]
# FIX: labels had a stray leading space (" Mexico") and inconsistent case ("canada").
stores=["USA","Mexico","Canada"]
fig, ax = plt.subplots(1,3, figsize=(36,10))
for i,store in enumerate([usa_store , Mexico_store, Canada_store]):
    fig.sca(ax[i])
    df_category_quantity = store.groupby('media_type')['cost'].mean().reset_index()
    # BUG FIX: sort_values returns a new frame; the result was discarded, so
    # the [:10] slice previously took the first 10 media types alphabetically
    # instead of the 10 with the highest mean cost.
    df_category_quantity = df_category_quantity.sort_values(by='cost', ascending=False)
    plt.barh(df_category_quantity[:10]['media_type'], df_category_quantity[:10]['cost'])
    plt.xticks(rotation='vertical', size=8)
    plt.yticks(size=20)
    plt.xlabel('Quantity')
    plt.ylabel('Category')
    plt.title(stores[i], fontsize=20,color='brown')
fig.suptitle('the highest media type according to cost in each country',fontsize=20, color='brown', y=.95)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import mutual_info_regression
def make_mi_score(x, y):
    """Return the mutual-information score of every feature in x against target y.

    Object/category columns are integer-factorized first; integer-dtype columns
    are flagged as discrete for mutual_info_regression. Result is a Series
    sorted from most to least informative feature.
    """
    features = x.copy()
    # Factorize any non-numeric column so sklearn can consume it.
    for colname in features.select_dtypes(['object', 'category']).columns:
        features[colname], _ = features[colname].factorize()
    discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    scores = mutual_info_regression(
        features, y, discrete_features=discrete, random_state=0
    )
    return (
        pd.Series(scores, name='Mutual Information Score', index=features.columns)
        .sort_values(ascending=False)
    )
# Score every feature against the regression target `cost`.
x = df.copy()
y = x.pop('cost')  # remove the target from the feature matrix
mi_score = make_mi_score(x, y)
mi_score
promotion_name 3.776 store_sqft 2.734 grocery_sqft 2.732 frozen_sqft 2.731 meat_sqft 2.730 store_city 2.715 media_type 2.541 store_state 1.924 store_type 1.156 sales_country 0.819 florist 0.664 prepared_food 0.615 salad_bar 0.612 coffee_bar 0.603 video_store 0.583 avg. yearly_income_no_dollar 0.099 avg. yearly_income 0.099 unit_sales(in millions) 0.074 total_children 0.073 num_children_at_home 0.070 avg_cars_at home(approx) 0.062 avg_cars_at home(approx).1 0.059 education 0.055 occupation 0.053 member_card 0.041 gender 0.017 houseowner 0.014 marital_status 0.012 unit_sales 0.010 store_cost(in millions) 0.002 units_per_case 0.002 food_category 0.000 low_fat 0.000 recyclable_package 0.000 net_weight 0.000 gross_weight 0.000 food_department 0.000 brand_name 0.000 store_sales(in millions) 0.000 food_family 0.000 SRP 0.000 Name: Mutual Information Score, dtype: float64
def plot_mi_score(score):
    """Draw a horizontal bar chart of MI scores, least informative at the bottom."""
    ordered = score.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.figure(figsize=(14, 14))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title('Mutual Information Score')
plot_mi_score(mi_score)
#["food_category",'food_department','food_family','promotion_name','sales_country','marital_status','gender','education','member_card','occupation','houseowner','avg. yearly_income','brand_name','store_type','store_city','store_state','media_type']
# Label-encode every categorical column, keeping one fitted encoder per column
# so the mapping can be inverted later. Replaces 17 copy-pasted
# encoder-create/fit/assign triples with a single loop.
_label_cols = [
    'food_category', 'food_department', 'food_family', 'promotion_name',
    'sales_country', 'marital_status', 'gender', 'education', 'member_card',
    'occupation', 'houseowner', 'avg. yearly_income_no_dollar', 'brand_name',
    'store_type', 'store_city', 'store_state', 'media_type',
]
label_encoders = {}
for _col in _label_cols:
    _enc = LabelEncoder()
    df[_col] = _enc.fit_transform(df[_col])
    label_encoders[_col] = _enc

# Backward-compatible aliases matching the per-column encoder variables the
# original cell created (in case later cells use them for inverse_transform).
food_category_encoder = label_encoders['food_category']
food_department_encoder = label_encoders['food_department']
food_family_encoder = label_encoders['food_family']
promotion_name_encoder = label_encoders['promotion_name']
sales_country_encoder = label_encoders['sales_country']
marital_status_encoder = label_encoders['marital_status']
gender_encoder = label_encoders['gender']
education_encoder = label_encoders['education']
member_card_encoder = label_encoders['member_card']
occupation_encoder = label_encoders['occupation']
houseowner_encoder = label_encoders['houseowner']
yearly_income_encoder = label_encoders['avg. yearly_income_no_dollar']
brand_name_encoder = label_encoders['brand_name']
store_type_encoder = label_encoders['store_type']
store_city_encoder = label_encoders['store_city']
store_state_encoder = label_encoders['store_state']
media_type_encoder = label_encoders['media_type']
#final_data = encoder.fit_transform(df.drop(columns='cost'))
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 60428 entries, 0 to 60427 Data columns (total 42 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 food_category 60428 non-null int64 1 food_department 60428 non-null int64 2 food_family 60428 non-null int64 3 store_sales(in millions) 60428 non-null float64 4 store_cost(in millions) 60428 non-null float64 5 unit_sales(in millions) 60428 non-null float64 6 promotion_name 60428 non-null int64 7 sales_country 60428 non-null int64 8 marital_status 60428 non-null int64 9 gender 60428 non-null int64 10 total_children 60428 non-null float64 11 education 60428 non-null int64 12 member_card 60428 non-null int64 13 occupation 60428 non-null int64 14 houseowner 60428 non-null int64 15 avg_cars_at home(approx) 60428 non-null float64 16 avg. yearly_income 60428 non-null object 17 num_children_at_home 60428 non-null float64 18 avg_cars_at home(approx).1 60428 non-null float64 19 brand_name 60428 non-null int64 20 SRP 60428 non-null float64 21 gross_weight 60428 non-null float64 22 net_weight 60428 non-null float64 23 recyclable_package 60428 non-null float64 24 low_fat 60428 non-null float64 25 units_per_case 60428 non-null float64 26 store_type 60428 non-null int64 27 store_city 60428 non-null int64 28 store_state 60428 non-null int64 29 store_sqft 60428 non-null float64 30 grocery_sqft 60428 non-null float64 31 frozen_sqft 60428 non-null float64 32 meat_sqft 60428 non-null float64 33 coffee_bar 60428 non-null float64 34 video_store 60428 non-null float64 35 salad_bar 60428 non-null float64 36 prepared_food 60428 non-null float64 37 florist 60428 non-null float64 38 media_type 60428 non-null int64 39 cost 60428 non-null float64 40 avg. yearly_income_no_dollar 60428 non-null int64 41 unit_sales 60428 non-null float64 dtypes: float64(24), int64(17), object(1) memory usage: 19.4+ MB
# Feature matrix: drop the target ('cost') plus duplicate/low-value columns.
# BUG FIX: the original drop list contained "net_weight" twice; the duplicate
# is removed here (the same set of columns is still dropped).
x = df.drop(columns=['cost', "unit_sales", "avg. yearly_income", "SRP",
                     "food_category", "low_fat", "net_weight",
                     "food_department", "store_cost(in millions)",
                     "brand_name", "gross_weight"])
y = df['cost']
# random_state pins the 70/30 split so the reported metrics are reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                    random_state=0)
x_train.shape
(42299, 31)
# Baseline model: ordinary least-squares linear regression on the raw
# (unscaled) feature matrix.
lr = LinearRegression()
lr.fit(x_train,y_train)
print('Attempting to fit Linear Regressor')
Attempting to fit Linear Regressor
# Evaluate the linear model on the held-out validation split.
y_pred_val_lr = lr.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_lr))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_lr))
print("\n")
# BUG FIX: RMSE is sqrt(MSE), not sqrt(MAE) as in the original.
print('RMSE on Validation set :',np.sqrt(metrics.mean_squared_error(y_test, y_pred_val_lr)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_lr))
print("\n")
MAE on Validation set : 25.446287942525657 MSE on Validation set : 876.016344656049 RMSE on Validation set : 5.044431379504102 R2 Score on Validation set : 0.03246551904513406
# Support-vector regression with default hyperparameters (RBF kernel).
# NOTE(review): SVR is scale-sensitive and the features are unscaled — this
# likely explains the weak score; consider StandardScaler before fitting.
svm = SVR()
svm.fit(x_train,y_train)
print('Attempting to fit Support Vector Regressor')
Attempting to fit Support Vector Regressor
# Evaluate the SVR model on the held-out validation split.
y_pred_val_svm = svm.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_svm))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_svm))
print("\n")
# BUG FIX: RMSE is sqrt(MSE), not sqrt(MAE) as in the original.
print('RMSE on Validation set :',np.sqrt(metrics.mean_squared_error(y_test, y_pred_val_svm)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_svm))
print("\n")
MAE on Validation set : 25.430824008612543 MSE on Validation set : 888.7690687910297 RMSE on Validation set : 5.042898373813669 R2 Score on Validation set : 0.018380507501720822
# Single decision tree; random_state=0 makes the tree deterministic.
dc = DecisionTreeRegressor(random_state = 0)
dc.fit(x_train,y_train)
print('Attempting to fit Decision Tree Regressor')
Attempting to fit Decision Tree Regressor
# Evaluate the decision tree on the held-out validation split.
y_pred_val_dc = dc.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_dc))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_dc))
print("\n")
# BUG FIX: RMSE is sqrt(MSE), not sqrt(MAE) as in the original.
print('RMSE on Validation set :',np.sqrt(metrics.mean_squared_error(y_test, y_pred_val_dc)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_dc))
print("\n")
MAE on Validation set : 0.04107231507548138 MSE on Validation set : 1.5291222902531858 RMSE on Validation set : 0.20266305799400489 R2 Score on Validation set : 0.9983111290668926
# Random-forest ensemble.  random_state=0 added for reproducibility and for
# consistency with the decision tree above (the original was unseeded, so
# its scores varied from run to run).
rf = RandomForestRegressor(random_state=0)
rf.fit(x_train,y_train)
print('Attempting to fit Random Forest Regressor')
Attempting to fit Random Forest Regressor
# Evaluate the random forest on the held-out validation split.
y_pred_val_rf = rf.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_rf))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_rf))
print("\n")
# BUG FIX: RMSE is sqrt(MSE), not sqrt(MAE) as in the original.
print('RMSE on Validation set :',np.sqrt(metrics.mean_squared_error(y_test, y_pred_val_rf)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_rf))
print("\n")
MAE on Validation set : 0.05551054663813577 MSE on Validation set : 0.7075218580258164 RMSE on Validation set : 0.2356067627173205 R2 Score on Validation set : 0.9992185627610202
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split.  For a regressor the
# default scorer is R2, so label it as such — the original printed it as
# "Accuracy", which is misleading for a regression model.
cvs = cross_val_score(rf, x_train, y_train, cv=10, n_jobs=-1)
print('Mean CV R2: {:.2f} %'.format(cvs.mean() * 100))
print('Standard Deviation: {:.2f} %'.format(cvs.std() * 100))
Accuracy: 99.91 % Standard Deviation: 0.03 %
plt.scatter(y_test,y_pred_val_rf)
<matplotlib.collections.PathCollection at 0x7f0a7ce3e910>
# Multi-layer perceptron with a raised iteration cap (default is 200).
# NOTE(review): MLPs are scale-sensitive and the features are unscaled —
# presumably this is why it underperforms; consider StandardScaler first.
mlp = MLPRegressor(max_iter = 300)
mlp.fit(x_train, y_train)
print('Attempting to fit MLP Regressor')
Attempting to fit MLP Regressor
# Evaluate the MLP on the held-out validation split.
y_pred_val_mlp = mlp.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_mlp))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_mlp))
print("\n")
# BUG FIX: RMSE is sqrt(MSE), not sqrt(MAE) as in the original.
print('RMSE on Validation set :',np.sqrt(metrics.mean_squared_error(y_test, y_pred_val_mlp)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_mlp))
print("\n")
MAE on Validation set : 28.533542649786032 MSE on Validation set : 1161.6801167679357 RMSE on Validation set : 5.341679759194296 R2 Score on Validation set : -0.283041778465853
# Gradient-boosted trees with default hyperparameters.
gbc = GradientBoostingRegressor()
gbc.fit(x_train, y_train)
print('Attempting to fit Gradient Boosting Regressor')
Attempting to fit Gradient Boosting Regressor
# Evaluate the gradient-boosting model on the held-out validation split.
y_pred_val_gbc = gbc.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_gbc))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_gbc))
print("\n")
# BUG FIX: RMSE is sqrt(MSE), not sqrt(MAE) as in the original.
print('RMSE on Validation set :',np.sqrt(metrics.mean_squared_error(y_test, y_pred_val_gbc)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_gbc))
print("\n")
MAE on Validation set : 16.177236536913203 MSE on Validation set : 391.49473158977816 RMSE on Validation set : 4.022093551486987 R2 Score on Validation set : 0.5676054970481137
# LightGBM gradient-boosting model with default hyperparameters.
lgbm = LGBMRegressor()
lgbm.fit(x_train, y_train)
print('Attempting to fit Light GBM Regressor')
Attempting to fit Light GBM Regressor
# Evaluate the LightGBM model on the held-out validation split.
y_pred_val_lgbm = lgbm.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_lgbm))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_lgbm))
print("\n")
# BUG FIX: RMSE is sqrt(MSE), not sqrt(MAE) as in the original.
print('RMSE on Validation set :',np.sqrt(metrics.mean_squared_error(y_test, y_pred_val_lgbm)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_lgbm))
print("\n")
MAE on Validation set : 3.281731775269041 MSE on Validation set : 21.739774523220383 RMSE on Validation set : 1.8115550710008905 R2 Score on Validation set : 0.9759890536429906
# Side-by-side R2 comparison of every fitted model on the validation split.
print('Decision Tree Regressor R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_dc))
print('Random Forest Regressor R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_rf))
print('Light GBM R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_lgbm))
# BUG FIX: lr is a LinearRegression model — the original label said
# "Logistic Regression", which is a classifier and was never fitted here.
print('Linear Regression R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_lr))
print('SVR R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_svm))
print('MLP Regressor R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_mlp))
print('Gradient Boosting R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_gbc))
Decision Tree Regressor R2 Score on Validation set : 0.9983111290668926 Random Forest Regressor R2 Score on Validation set : 0.9992185627610202 Light GBM R2 Score on Validation set : 0.9759890536429906 Logistic Regression R2 Score on Validation set : 0.03246551904513406 SVR R2 Score on Validation set : 0.018380507501720822 MLP Regressor R2 Score on Validation set : -0.283041778465853 Gradient Boosting R2 Score on Validation set : 0.5676054970481137